library(tidyverse)
## ─ Attaching packages ───────────────────── tidyverse 1.2.1 ─
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ─ Conflicts ─────────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(stringr)
library(readr)
library(readxl)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(maps)

Read original data

heart_disease_stratify =  read_csv("data/Heart_Disease_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
  janitor::clean_names() %>%
  rename(state = location_abbr) %>%
  rename(mortality_rate = data_value) %>%

  mutate(state = state.name[match(state, state.abb)]) %>% 
  select(-data_source, -geographic_level, -class, -topic, -data_value_footnote, -data_value_footnote_symbol, -topic_id, -location_id ) 
## Parsed with column specification:
## cols(
##   Year = col_integer(),
##   LocationAbbr = col_character(),
##   LocationDesc = col_character(),
##   GeographicLevel = col_character(),
##   DataSource = col_character(),
##   Class = col_character(),
##   Topic = col_character(),
##   Data_Value = col_double(),
##   Data_Value_Unit = col_character(),
##   Data_Value_Type = col_character(),
##   Data_Value_Footnote_Symbol = col_character(),
##   Data_Value_Footnote = col_character(),
##   StratificationCategory1 = col_character(),
##   Stratification1 = col_character(),
##   StratificationCategory2 = col_character(),
##   Stratification2 = col_character(),
##   TopicID = col_character(),
##   LocationID = col_character(),
##   `Location 1` = col_character()
## )
heart_disease = heart_disease_stratify %>% 
  filter(stratification1 == "Overall", stratification2 == "Overall") %>% 
  select(-stratification1, -stratification2, -stratification_category1, -stratification_category2) 

heart_disease$mortality_rate[is.na(heart_disease$mortality_rate)] = 0

heart_disease = 
heart_disease %>% 
  group_by(state) %>% 
  summarise(mortality_rate = mean(mortality_rate))

Add air quality data

airquality_2015 = read_csv("data/airquality.csv") %>%
  janitor::clean_names() %>%
  select(state, pm2_5) %>% 
  group_by(state) %>% 
  summarize(pm2.5 = sum(pm2_5))
## Parsed with column specification:
## cols(
##   State = col_character(),
##   County = col_character(),
##   Year = col_integer(),
##   `Days with AQI` = col_integer(),
##   `Good Days` = col_integer(),
##   `Moderate Days` = col_integer(),
##   `Unhealthy for Sensitive Groups Days` = col_integer(),
##   `Unhealthy Days` = col_integer(),
##   `Very Unhealthy Days` = col_integer(),
##   `Hazardous Days` = col_integer(),
##   `Max AQI` = col_integer(),
##   `90th Percentile AQI` = col_integer(),
##   `Median AQI` = col_integer(),
##   `Days CO` = col_integer(),
##   `Days NO2` = col_integer(),
##   `Days Ozone` = col_integer(),
##   `Days SO2` = col_integer(),
##   PM2.5 = col_integer(),
##   `Days PM10` = col_integer()
## )

Add obesity data

obesity_data = read_csv("data/National_Obesity_By_State.csv") %>%
  janitor::clean_names() %>%
  rename(state = name) %>%
  rename(obesity_percentage = obesity) %>%
  select(state, obesity_percentage) 
## Parsed with column specification:
## cols(
##   OBJECTID = col_integer(),
##   NAME = col_character(),
##   Obesity = col_double(),
##   Shape__Area = col_double(),
##   Shape__Length = col_double()
## )
data_with_obesity = left_join(heart_disease, obesity_data)
## Joining, by = "state"

Add stroke data

stroke_data = read_csv("data/Stroke_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
  janitor::clean_names() %>%
  rename(stroke_value=data_value)%>%
  rename(state = location_abbr) %>%
  mutate(state = state.name[match(state, state.abb)])%>%

  select(state,stroke_value) %>% 
  group_by(state) %>% 
  filter(!is.na(stroke_value)) %>% 
  summarize(stroke_value = sum(stroke_value)) 
## Parsed with column specification:
## cols(
##   Year = col_integer(),
##   LocationAbbr = col_character(),
##   LocationDesc = col_character(),
##   GeographicLevel = col_character(),
##   DataSource = col_character(),
##   Class = col_character(),
##   Topic = col_character(),
##   Data_Value = col_double(),
##   Data_Value_Unit = col_character(),
##   Data_Value_Type = col_character(),
##   Data_Value_Footnote_Symbol = col_character(),
##   Data_Value_Footnote = col_character(),
##   StratificationCategory1 = col_character(),
##   Stratification1 = col_character(),
##   StratificationCategory2 = col_character(),
##   Stratification2 = col_character(),
##   TopicID = col_character(),
##   LocationID = col_character(),
##   `Location 1` = col_character()
## )

Add income

income_data = read_excel("data/income_2015.xlsx", range = "A4:D55") %>%
  janitor::clean_names() %>%
  rename(state = united_states, median_income = x55117, income_standard_error = x253) 
data_with_income = left_join(heart_disease,income_data, by = "state")
data_income_obesity = left_join(income_data,data_with_obesity, by = "state")


smoking_data = read_csv("data/smoking.csv") %>% 
  filter(YEAR == "2015-2016") %>% 
  mutate(year = 2015) %>% 
  rename(state = LocationDesc) %>% 
  select(-YEAR) %>% 

  select(year, state, Data_Value) %>% 

  select(year, state, Data_Value) %>% 
  rename(tobacco_comsumption = Data_Value) %>% 
  group_by(state) %>% 
  summarise(tobacco_consumption = sum(tobacco_comsumption))
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   Data_Value = col_double(),
##   Data_Value_Std_Err = col_double(),
##   Low_Confidence_Limit = col_double(),
##   High_Confidence_Limit = col_double(),
##   Sample_Size = col_integer(),
##   DisplayOrder = col_integer()
## )
## See spec(...) for full column specifications.
data_income_obesity_smoking = left_join(smoking_data, data_income_obesity, by = "state")

data_income_obesity_smoking_air = left_join(airquality_2015, data_income_obesity_smoking, by = "state")




data_income_obesity_smoking = left_join(smoking_data, data_income_obesity, by = "state")
data_income_obesity_smoking_air = left_join(airquality_2015, data_income_obesity_smoking, by = "state")


final_data = left_join(stroke_data, data_income_obesity_smoking_air, by = "state") 

Find the association between smoke and heart disease mortality

  final_data %>%
  mutate(state = forcats::fct_reorder(factor(state), tobacco_consumption)) %>%
  ggplot(aes(x = mortality_rate, y = tobacco_consumption)) + 
  geom_point(aes(color = state), alpha = .5) +
  labs(
    title = "Tabacco Consumption Accross states"
  ) +
  theme(text = element_text(size = 8), axis.text = element_text(angle = 60, hjust = 1), legend.position = "bottom")
## Warning: Removed 14 rows containing missing values (geom_point).

lm(mortality_rate~tobacco_consumption, data = final_data) %>%
summary()
## 
## Call:
## lm(formula = mortality_rate ~ tobacco_consumption, data = final_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -124.235  -46.749    0.038   40.108  115.435 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         241.8406    45.2029   5.350 5.56e-06 ***
## tobacco_consumption   0.8384     0.3669   2.285   0.0285 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62.37 on 35 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.1298, Adjusted R-squared:  0.105 
## F-statistic: 5.222 on 1 and 35 DF,  p-value: 0.02848

Find the association between income and heart disease mortality

final_data_income = 
  final_data %>% 
  mutate(state = forcats::fct_reorder(factor(state), median_income)) 

final_data_income %>% 
  ggplot(aes(x = mortality_rate, y = median_income, color = state)) +
  geom_point() +
  theme(text = element_text(size = 8), axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "bottom") + 

  #Add the title and the name for x and y axis. 
  labs(
    title = "Association between Income and Heart Disease Mortality Rate",
    x = "Mortality Rate",
    y = "median_income"
  )
## Warning: Removed 1 rows containing missing values (geom_point).

lm(mortality_rate ~ median_income, data = final_data_income) %>% 
  summary()
## 
## Call:
## lm(formula = mortality_rate ~ median_income, data = final_data_income)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.924 -29.753  -7.434  28.817 102.081 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.102e+02  3.994e+01  15.279  < 2e-16 ***
## median_income -4.826e-03  7.058e-04  -6.838  1.3e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.39 on 48 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.4935, Adjusted R-squared:  0.4829 
## F-statistic: 46.76 on 1 and 48 DF,  p-value: 1.303e-08

From the lm result, we can observe that median_income is a very significant variable with a p value of 1.3e-08. This indicates there is a strong association between income and heart disease mortality rate

Find the association between airquality and heart disease mortality

## make scatterplot 
final_data %>% 
  mutate(state = fct_reorder(state, mortality_rate)) %>% 
  ggplot(aes(x = mortality_rate, y = pm2.5, color = state)) + 
  geom_point() +
  ggtitle("Airquality VS Mortality Rate ") +
  theme(legend.position = "bottom",
        legend.direction = "horizontal",
        axis.text.x = element_text(angle = 90, size = 6),
        legend.key.size = unit(0.05, "cm")) + 
  labs(x = "Mortality Rate",
       y = "PM2.5") 
## Warning: Removed 1 rows containing missing values (geom_point).

## fit simple linear regression model 
air_regression<-lm(final_data$mortality_rate~final_data$pm2.5) 
summary(air_regression)
## 
## Call:
## lm(formula = final_data$mortality_rate ~ final_data$pm2.5)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89.08 -38.02 -15.49  34.66 139.55 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.382e+02  1.427e+01  23.698   <2e-16 ***
## final_data$pm2.5 9.734e-04  4.673e-03   0.208    0.836    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62.34 on 48 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0009031,  Adjusted R-squared:  -0.01991 
## F-statistic: 0.04339 on 1 and 48 DF,  p-value: 0.8359

From the scatterplot, we can see that the points are spread randomlly. However, the relationship between pm2.5 and mortality rate is unclear. For the states, with low pm2.5, some of them have low mortality rate and some of them have high mortality rate. After we fit the simple regression model, the p-value for pm2.5 is 0.836, so it is a non-significant variable.

Map

library(plotly)
map_data = final_data %>% 
    mutate(state = tolower(state)) 


states <- map_data("state") %>% 
  rename(state = region)
  
a = left_join(states, map_data, by = "state") %>% 
  mutate(text_label = str_c("Region: ", state, 'Mortality rate: ', mortality_rate) ) 

a$text_label <- with(a, paste(state, '<br>', "Mortality_rate", mortality_rate))
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

p <- plot_geo(a, locationmode = 'USA-states') %>%
  add_trace(
    z = ~mortality_rate, text = ~text_label, locations = ~us,
    color = ~mortality_rate, colors = 'Purples'
  ) %>%
  colorbar(title = "Millions USD") %>%
  layout(
    title = '2011 US Agriculture Exports by State<br>(Hover for breakdown)',
    geo = g
  )
## Warning: Ignoring 10 observations
p